llama.cpp on Tesla K80
Install the dependencies:
code:sh
sudo apt install build-essential libcurl4-openssl-dev
The cmake shipped with Ubuntu 20.04 is too old, so install a newer cmake from a PPA:
code:sh
sudo add-apt-repository ppa:ecal/cmake-3.25 && sudo apt update && sudo apt install cmake
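To check that the PPA cmake is the one on PATH (a quick sanity check, not in the original notes):
code:sh
cmake --version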
Clone the repository and move into it:
code:sh
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
Build
The architecture list "37;60" covers both cards: 37 = K80 (Kepler), 60 = P100 (Pascal). A CUDA 11.x toolkit is used because CUDA 12 dropped Kepler support.
code:sh
cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda-11.4/bin/nvcc -DCMAKE_CUDA_ARCHITECTURES="37;60"
code:sh
cmake --build build --config Release -- -j $(nproc)
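Before running anything, it is worth confirming the driver sees all the cards. A minimal check with nvidia-smi (assuming the NVIDIA driver is already installed):
code:sh
nvidia-smi --query-gpu=index,name,memory.total --format=csv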
Models used
These runs were done with the P100 and K80s installed together in the same machine.
unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF:Q4_K_M
code:sh
./build/bin/llama-cli -hf unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF:Q4_K_M -no-cnv --prompt "Once upon a time" -ngl 28
The output wouldn't stop.
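The run above set no generation limit, so a reasoning model can ramble indefinitely. A simple guard (not tried here) is to cap the output with -n:
code:sh
./build/bin/llama-cli -hf unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF:Q4_K_M -no-cnv --prompt "Once upon a time" -ngl 28 -n 256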
unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF:Q4_K_M on P100
code:sh
./build/bin/llama-cli -hf unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF:Q4_K_M -no-cnv --prompt "Once upon a time" -ngl 33 --split-mode none --device CUDA0
code:txt
llama_perf_sampler_print: sampling time = 118.36 ms / 1312 runs ( 0.09 ms per token, 11084.92 tokens per second)
llama_perf_context_print: load time = 2034.37 ms
llama_perf_context_print: prompt eval time = 57.96 ms / 5 tokens ( 11.59 ms per token, 86.27 tokens per second)
llama_perf_context_print: eval time = 38106.72 ms / 1306 runs ( 29.18 ms per token, 34.27 tokens per second)
llama_perf_context_print: total time = 38540.25 ms / 1311 tokens
unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF:Q4_K_M on K80
code:sh
./build/bin/llama-cli -hf unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF:Q4_K_M -no-cnv --prompt "Once upon a time" -ngl 33 --split-mode none --device CUDA1
code:txt
llama_perf_sampler_print: sampling time = 229.43 ms / 2524 runs ( 0.09 ms per token, 11001.37 tokens per second)
llama_perf_context_print: load time = 2248.09 ms
llama_perf_context_print: prompt eval time = 362.83 ms / 5 tokens ( 72.57 ms per token, 13.78 tokens per second)
llama_perf_context_print: eval time = 294234.25 ms / 2518 runs ( 116.85 ms per token, 8.56 tokens per second)
llama_perf_context_print: total time = 295313.18 ms / 2523 tokens
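Note that CUDA0/CUDA1 above follow ggml's device enumeration, which need not match nvidia-smi's ordering. Newer llama.cpp builds can print the mapping (flag availability depends on the build):
code:sh
./build/bin/llama-cli --list-devices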
unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF:Q4_K_M on P100 * 1 + K80 * 2, tensor row split
code:sh
./build/bin/llama-cli -hf unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF:Q4_K_M -no-cnv --prompt "Once upon a time" -ngl 33 --split-mode row --main-gpu 0
code:txt
llama_perf_sampler_print: sampling time = 219.72 ms / 2369 runs ( 0.09 ms per token, 10782.00 tokens per second)
llama_perf_context_print: load time = 3306.18 ms
llama_perf_context_print: prompt eval time = 160.16 ms / 5 tokens ( 32.03 ms per token, 31.22 tokens per second)
llama_perf_context_print: eval time = 181715.05 ms / 2363 runs ( 76.90 ms per token, 13.00 tokens per second)
llama_perf_context_print: total time = 182553.66 ms / 2368 tokens
unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF:Q4_K_M on P100 * 1 + K80 * 2, layer split
code:sh
./build/bin/llama-cli -hf unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF:Q4_K_M -no-cnv --prompt "Once upon a time" -ngl 81 --split-mode layer
code:txt
llama_perf_sampler_print: sampling time = 147.50 ms / 1422 runs ( 0.10 ms per token, 9640.35 tokens per second)
llama_perf_context_print: load time = 163676.09 ms
llama_perf_context_print: prompt eval time = 2180.37 ms / 5 tokens ( 436.07 ms per token, 2.29 tokens per second)
llama_perf_context_print: eval time = 896793.71 ms / 1416 runs ( 633.33 ms per token, 1.58 tokens per second)
llama_perf_context_print: total time = 899432.42 ms / 1421 tokens
unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF:Q4_K_M on P100 * 1 + K80 * 2, tensor row split
code:sh
./build/bin/llama-cli -hf unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF:Q4_K_M -no-cnv --prompt "Once upon a time" -ngl 81 --split-mode row --main-gpu 0
code:txt
llama_perf_sampler_print: sampling time = 128.97 ms / 1298 runs ( 0.10 ms per token, 10064.36 tokens per second)
llama_perf_context_print: load time = 15308.25 ms
llama_perf_context_print: prompt eval time = 887.35 ms / 5 tokens ( 177.47 ms per token, 5.63 tokens per second)
llama_perf_context_print: eval time = 377637.21 ms / 1292 runs ( 292.29 ms per token, 3.42 tokens per second)
llama_perf_context_print: total time = 378931.94 ms / 1297 tokens
unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF:Q4_K_M on K80 * 4, layer split
code:sh
./build/bin/llama-cli -hf unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF:Q4_K_M -no-cnv --prompt "Once upon a time" -ngl 81 --split-mode layer
code:txt
llama_perf_sampler_print: sampling time = 97.03 ms / 660 runs ( 0.15 ms per token, 6801.74 tokens per second)
llama_perf_context_print: load time = 18710.21 ms
llama_perf_context_print: prompt eval time = 3129.01 ms / 5 tokens ( 625.80 ms per token, 1.60 tokens per second)
llama_perf_context_print: eval time = 550150.46 ms / 654 runs ( 841.21 ms per token, 1.19 tokens per second)
llama_perf_context_print: total time = 553598.14 ms / 659 tokens
unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF:Q4_K_M on K80 * 4, tensor row split
code:sh
./build/bin/llama-cli -hf unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF:Q4_K_M -no-cnv --prompt "Once upon a time" -ngl 81 --split-mode row --main-gpu 0
code:txt
CUDA error: out of memory
current device: 4, in function alloc at /home/ubuntu/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu:451
cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0)
It didn't run.
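The failure is in the CUDA VMM pool allocator (cuMemAddressReserve). An untested workaround would be to rebuild with ggml's VMM pool disabled via the GGML_CUDA_NO_VMM option, which falls back to a plain cudaMalloc-based pool:
code:sh
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_NO_VMM=ON -DLLAMA_CURL=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda-11.4/bin/nvcc -DCMAKE_CUDA_ARCHITECTURES="37;60"
cmake --build build --config Release -- -j $(nproc)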
unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M on K80 * 2, layer split
code:sh
./build/bin/llama-cli -hf unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M -no-cnv --prompt "自己紹介してください" -ngl 65 --split-mode layer
code:txt
llama_perf_sampler_print: sampling time = 86.76 ms / 727 runs ( 0.12 ms per token, 8379.15 tokens per second)
llama_perf_context_print: load time = 7588.13 ms
llama_perf_context_print: prompt eval time = 1688.43 ms / 6 tokens ( 281.40 ms per token, 3.55 tokens per second)
llama_perf_context_print: eval time = 282065.38 ms / 720 runs ( 391.76 ms per token, 2.55 tokens per second)
llama_perf_context_print: total time = 284024.55 ms / 726 tokens
unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M on K80 * 2, tensor row split
code:sh
./build/bin/llama-cli -hf unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M -no-cnv --prompt "自己紹介してください" -ngl 65 --split-mode row
It went haywire (the output degenerated into nonsense).
DeepSeek-R1
unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF:F16
code:sh
./build/bin/llama-cli -hf unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF:F16 --single-turn --n-gpu-layers 100 --split-mode layer --prompt "こんにちは"
Qwen3
unsloth/Qwen3-32B-GGUF:BF16
code:sh
./build/bin/llama-cli -hf unsloth/Qwen3-32B-GGUF:BF16 -no-cnv --prompt "「銀河超特急の夜」というタイトルで宮沢賢治風の童話を書いてください。" -ngl 100 --split-mode layer
code:txt
/home/ubuntu/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu:75: CUDA error
CUDA error: CUBLAS_STATUS_ARCH_MISMATCH
current device: 0, in function ggml_cuda_op_mul_mat_cublas at /home/ubuntu/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu:1220
cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N, row_diff, src1_ncols, ne10, &alpha_f32, src0_ptr, CUDA_R_16BF, ne00, src1_ptr, CUDA_R_16BF, ne10, &beta_f32, dst_bf16.get(), CUDA_R_16BF, ldc, CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)
Neither the P100 nor the K80 supports BF16 GEMM in cuBLAS (it requires Ampere, compute capability 8.0, or newer), hence the CUBLAS_STATUS_ARCH_MISMATCH. Falling back to Q8_0:
unsloth/Qwen3-32B-GGUF:Q8_0
code:sh
./build/bin/llama-cli -hf unsloth/Qwen3-32B-GGUF:Q8_0 --single-turn --n-gpu-layers 100 --split-mode layer --prompt "「銀河超特急の夜」というタイトルで宮沢賢治風の童話を書いてください。"
unsloth/Qwen3-30B-A3B-GGUF:Q8_0
code:sh
./build/bin/llama-cli -hf unsloth/Qwen3-30B-A3B-GGUF:Q8_0 --single-turn --n-gpu-layers 100 --split-mode layer --prompt "こんにちは"
unsloth/Qwen3-30B-A3B-GGUF:BF16
code:sh
./build/bin/llama-cli -hf unsloth/Qwen3-30B-A3B-GGUF:BF16 --single-turn --n-gpu-layers 100 --split-mode layer --prompt "こんにちは"
llama-bench
-p 0 skips the prompt-processing test, and -n 128,256,512 measures generation speed at three output lengths.
code:sh
./build/bin/llama-bench -p 0 -n 128,256,512 \
-m ~/.cache/llama.cpp/unsloth_Qwen3-32B-GGUF_Qwen3-32B-Q8_0.gguf \
-m ~/.cache/llama.cpp/unsloth_Qwen3-30B-A3B-GGUF_Qwen3-30B-A3B-Q8_0.gguf \
-m ~/.cache/llama.cpp/mmns_Qwen3-32B-F16.gguf \
-m ~/.cache/llama.cpp/mmns_Qwen3-30B-A3B-F16.gguf \
-m ~/.cache/llama.cpp/unsloth_DeepSeek-R1-Distill-Llama-70B-GGUF_DeepSeek-R1-Distill-Llama-70B-Q4_K_M.gguf
Results